capture log close
log using usa-decomp-2015-race, replace text

//  program: usa-decomp-2015-race.do
//  task:    decompose life expectancy by sex and race over time        
//  input:   allcause and cause-specific mortality
//  output:  none
//  project: life expectancy	
//  author:  sam harper \ 16feb2017


// #0
// program setup

version 14
set linesize 80
clear all
macro drop _all



// #1
// load the mortality data, downloaded from CDC WONDER database

import delimited "usa-decomp-2015-race-data.txt", ///
  encoding(ISO-8859-1)clear

* drop extra rows for Notes from CDC WONDER
drop if year==.

* fix up variable names and labels
encode gender, gen(sex)
encode race, gen(racebw)
drop race
rename racebw race
label define race 1 "NH Black" 2 "NH White", modify
label values race race

replace tenyearagegroups="01-04 years" if tenyearagegroups=="1-4 years"
replace tenyearagegroups="05-14 years" if tenyearagegroups=="5-14 years"
replace tenyearagegroups="00-01 years" if tenyearagegroups=="< 1 year"
encode tenyearagegroups, gen(age)
rename (deaths population icd10113causelist) (count pop icdcode)

* extract numeric causes of death from CDCs 113 cause list
gen cod113s=substr(icd10113causelistcode,7,3)
destring cod113s, gen(cod113)

* recode to a small number of causes (arbitrary)
recode cod113 (16 = 6 "HIV") (19 = 2 "Cancers") (46 = 3 "Diabetes") ///
  (52 = 4 "Alzheimer's") (53 = 1 "CVDs") (76 = 5 "Flu/pneumonia") ///
  (82 = 7 "Chronic Resp dx") (93 = 8 "Liver dx") (97 = 9 "Kidney dx") ///
  (114 = 10 "MV crashes") (122 = 11 "Poisoning") ///
  (124 = 12 "Suicide") (127 = 13 "Homicide") ///
  (1/15 17 18 44 45 47 50 51 79 87/92 96 102/105 108/111 115 116 118/121 ///
  123 130 131 134/136 = 14 "Residual") ///
  (20/43 48 49 54/75 77 78 80 81 83/86 94 95 98/101 106 107 112 113 117 ///
  125 126 128 129 132 133 = .), gen(cod14)

* summarize over sex, age, cause and year  
collapse (sum) count (max) pop, by(sex race age cod14 year)

* drop extra cells for causes subsumed within broad categories
* to avoid double counting
drop if cod14==.
drop if year==.


gen rate = count / pop * 100000
label var rate "death rate"
label var count "no. of deaths"
label var pop "mid-year population"

* save this dataset for life expectancy calculations (#3)
save usa-decomp-2015-race, replace



// #2
// calculate some age-adjusted death rates
use "age19std.dta", clear
label list age19
recode age19 (3/4=3) (5/6=4) (7/8=5) (9/10=6) (11/12=7) (13/14=8) ///
  (15/16=9) (17/18=10) (19=11), gen(age)
keep if std==9 // 2000 US Std Million
collapse (sum) stdcount, by(age)
egen tstdcount=sum(stdcount)
gen stdwt=stdcount/tstdcount
drop tstdcount
label var stdwt "US 2000 standard weight"

merge 1:m age using usa-decomp-2015-race
drop _merge

*create age-adjusted rates, all ages
* multiply crude age-specific rate by standard weight
gen aarate=rate*stdwt

* sum over age categories, by race, sex, cause, and year
collapse (sum) count aarate pop, by(sex race cod year)

* crude rate
gen crate=count/pop*100000
label var crate "Crude rate per 100,000 population"
label var aarate "Age-adjusted rate per 100,000 population"

* Appendix Table 1
table cod year race, c(mean crate) by(sex) format(%6.1f)
table cod year race, c(mean aarate) by(sex) format(%6.1f)


// #3
// set up for life table calculation

* sum deaths and population over causes (i.e., ignoring cause of death)
use usa-decomp-2015-race, clear
collapse (sum) count (max) pop, by(sex race age year)

* mortality rate
gen rate = count / pop * 100000
label var rate "death rate per 100,000"

* have a look at the rates by year
table age year race, c(mean rate) by(sex) format(%7.1f)

* group by sex and year for faster life table construction
egen class=group(sex race year)

* define number of years in age interval (10-year age groups)
gen n=1 if age==1
replace n=4 if age==2
replace n=10 if age>2
replace n=1 if age==11
label var n "no. years in age interval"

* average person-years contributed by those dying within interval
* assumed to be 1/2 apart from infant mortality
gen ax=0.1 if age==1 // infants
replace ax=0.5 if age>1 & age<=11 // all other age groups

* life table variables
foreach var in m q p l d L T e var_q v sv var_e se_e {
	qui gen `var'x=.
	}

* labels
label var ax "avg time contributed by deaths"
label var mx "death rate at age x"
label var qx "probability of death at age x"
label var px "probability of survival at age x"
label var lx "number alive at age x"
label var dx "expected deaths at age x"
label var Lx "person-years lived in interval"
label var Tx "time lived beyond age x"
label var ex "life expectancy at age x"
label var var_qx "variance of prob. of death"
label var vx "Chiang formula for variance"
label var svx "sum of Chiang formula"
label var var_ex "variance of life expectancy"
label var se_ex "standard error of life expectancy"



// #4
// calculate life table values by group

sort class age

qui levelsof class, local(levels)
foreach l of local levels {

	* mortality rate
	qui replace mx=count/pop if class==`l'   
	
	* probability of death	
	qui replace qx=n*mx/(1+n*(1-ax)*mx) if class==`l'
	qui replace qx = 1 if age==11 & class==`l'
	
	* conditional prob of survival
	qui replace px=1-qx if class==`l'
	
	* no alive at beginning of interval
	qui replace lx = 100000 if age==1 & class==`l'
	qui replace lx = lx[_n-1] * px[_n-1] if age>1 & class==`l'
	
	* Generate deaths by differencing the number of survivors and 
	* noting that everyone dies in the end
	qui replace dx = lx - lx[_n+1] if class==`l'
	qui replace dx = lx if age==11 & class==`l'
	
	* Compute person-years lived in each age group
	* n for those who survive the age group and nax for those who die
	qui replace Lx = n * (lx[_n+1] + (ax*dx)) if class==`l'
	qui replace Lx = lx/mx if age==11 & class==`l'
	

	/* Accumulating from the bottom up is a bit tricky because Stata likes 
	to sum from the top down. You could sort the data from oldest to 
	youngest, sum, and then sort again. I will subtract the cumulative 
	sum from the total.*/
	qui sum Lx if class==`l'
	qui replace Tx = r(sum) - sum(Lx) + Lx if class==`l'
	
	
	* Compute life expectancy 
	*(time lived after each age / survivors to that age)
	qui replace ex = Tx/lx if class==`l'
	
	* variance of cond. probability of death
	qui replace var_qx = [n^2 * mx*(1-ax*n*mx)] / [pop*(1+(1-ax)*n*mx)^3] if class==`l'
	qui replace var_qx = 0 in -1 if class==`l'


	* calculate second part of Chiang formula for variance of LE [add cite] 
	qui replace vx = (lx^2)*[((1-ax)*n+ex[_n+1])^2]*var_qx if class==`l'
	qui replace vx = 0 in -1 if class==`l'
	
	* sum of vx
	qui sum vx if class==`l'
	qui replace svx = r(sum) - sum(vx) + vx if class==`l'
	
	* variance and se of life expectancy
	qui replace var_ex = svx / lx^2 if class==`l'
	qui replace se_ex = sqrt(var_ex) if class==`l'
	}


* specify a few formats
format %6.3f ax ex var_ex se_ex
format %8.6f mx qx px
format %9.0fc pop count lx dx Lx Tx

* table of life expectancies by year
table race year sex if age==1, c(mean ex) format(%4.2f)


// #5
// Decompose by age

// drop unnecessary variables and reshape the data to wide format with
// rows for each sex year age and colums for each race group

keep lx Tx Lx mx sex race year age
reshape wide lx Tx Lx mx, i(sex race age) j(year)

/* decompose LE by age, using formulas from Arriaga (1984) 
	Measuring and explaining the change in life expectancies. 
	Demography 1984;21: 83-96. */

* generate direct effect
gen de=(lx2015/100000) * ((Lx2014/lx2014) - (Lx2015/lx2015))
label var de "direct effect"

* generate indirect effect and interaction term
gen ie=(Tx2014[_n+1]/100000) * ///
  ((lx2015/lx2014) - (lx2015[_n+1]/lx2014[_n+1])) if age!=11
replace ie=0 if age==11
label var ie "indirect effect+interact"

* total effect (direct + indirect + interaction)
* contribution in years of LE
gen te=de+ie
label var te "diff in life exp"

drop lx* Lx* Tx*

* reshape dataset to wide format to calculate total
reshape wide de ie te mx2014 mx2015, i(sex race) j(age)

foreach var of newlist de ie te {
	egen `var'12 = rsum(`var'*) // sum across age groups
	}

* reshape dataset back to long
reshape long de ie te mx2014 mx2015, i(sex race) j(age)

* total across all age groups
label define age 12 "Total", add
label values age age

* proportional contribution
sort sex race age
bysort sex race: gen pctgap=te[_n] / te[12]

table age race, c(sum te) by(sex)
table age race, c(sum pctgap) by(sex)


egen class=group(sex race)
label define class 1 "Non-Hispanic Black Women" ///
  2 "Non-Hispanic White Women" 3 "Non-Hispanic Black Men" ///
  4 "Non-Hispanic White Men", modify
label values class class

* save this as a dataset for plotting in R
saveold age-race, replace version(12)


// #6
// estimate cause-specific proportion of deaths

* load the mortality data
use usa-decomp-2015-race, clear

* calculate proportion of deaths for each cause by sex, year age
rename cod14 cod
drop pop rate
reshape wide count, i(sex race age cod) j(year)

* now reshape wide again to get deaths by cause as variables
reshape wide count2014 count2015, i(sex race age) j(cod)

* total deaths for each group
foreach v of numlist 2014 2015 {
  egen tdeaths`v' = rsum(count`v'*)
}

* proportion of deaths for each cause, by age, year, race
forvalues i=1/14 {
  gen pdeaths2014`i' = count2014`i' / tdeaths2014
  gen pdeaths2015`i' = count2015`i' / tdeaths2015
}

* save dataset for merging with age-decompositions
save cod-race, replace


// #7
// now decomposition by age and cause of death

* load the age decomposition
use age-race, clear
drop if age==12 // drop total for all ages

* merge with proportion of deaths by cause
merge 1:1 sex race age using cod-race
drop _merge

/* formula for partitioning of each age group component by cause of death
  from Arriaga EE. Changing trends in mortality decline during the last
  decades. In Ruzicka et al. Differential mortality: Methodological issues 
  and biosocial factors. 1989;p. 105–29.*/

local i = 1
while `i' < 15 {
	gen cause`i' = te* (((mx2014*pdeaths2014`i') - (mx2015*pdeaths2015`i')) ///
	/ (mx2014-mx2015))
	local ++i
}

* drop proportions of deaths by cause
drop count* pdeaths*

* reshape long by cause
reshape long cause, i(sex race age) j(cod)

rename cause cont
label var cont "contribution to LE gap"


* proportional contribution to change in the gap
egen gap=total(cont), by(sex race)
gen pctgapc=cont/gap

table cod race, contents(sum pctgapc) row by(sex)

collapse (sum) cont, by(sex race cod)

reshape wide cont, i(sex race) j(cod)
egen cont15 = rsum(cont*)
reshape long cont, i(sex race) j(cod)

label define cod 15 "Total", add

* proportional contribution
sort sex race cod
bysort sex race: gen pctgapc=cont[_n] / cont[15]

egen class=group(sex race)
label define class 1 "Non-Hispanic Black Women" ///
  2 "Non-Hispanic White Women" 3 "Non-Hispanic Black Men" ///
  4 "Non-Hispanic White Men", modify
label values class class

* save dataset for plotting in R
saveold cod-race-plots, replace version(12)


log close
exit
